RegExpLinksExtractor xref

View Javadoc

1   
2   /*
3    * SmartCrawler
4    *
5    * $Id: RegExpLinksExtractor.java,v 1.8 2005/08/05 15:55:53 vincool Exp $
6    * Copyright 2005 Davide Pozza
7    *
8    * This program is free software; you can redistribute it
9    * and/or modify it under the terms of the GNU General Public
10   * License as published by the Free Software Foundation;
11   * either version 2 of the License, or (at your option) any
12   * later version.
13   *
14   * This program is distributed in the hope that it will be
15   * useful, but WITHOUT ANY WARRANTY; without even the implied
16   * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
17   * PURPOSE. See the GNU General Public License for more
18   * details.
19   *
20   * You should have received a copy of the GNU General Public
21   * License along with this program; if not, write to the Free
22   * Software Foundation, Inc., 59 Temple Place, Suite 330,
23   * Boston, MA 02111-1307 USA
24   *
25   */
26  
27  package org.smartcrawler.extractor;
28  
29  import java.nio.ByteBuffer;
30  import java.nio.CharBuffer;
31  import java.nio.charset.Charset;
32  import java.nio.charset.CharsetDecoder;
33  import java.util.Vector;
34  import java.util.regex.Matcher;
35  import java.util.regex.Pattern;
36  import org.apache.log4j.Logger;
37  import org.smartcrawler.extractor.pattern.AbstractPattern;
38  import org.smartcrawler.retriever.Content;
39  import org.smartcrawler.common.Link;
40  import org.smartcrawler.common.SCLogger;
41  
42  
43  /***
44   *
45   *
46   * @author <a href="mailto:pozzad@alice.it">Davide Pozza</a>
47   * @version <tt>$Revision: 1.8 $</tt>
48   */
49  public class RegExpLinksExtractor implements LinksExtractor {
50  
51      private static Logger log = SCLogger.getLogger(RegExpLinksExtractor.class);
52      private static Logger logExtr = SCLogger.getExtractorLogger();
53  
54      /*** Standard charset.*/
55      private final static Charset charset = Charset.forName("ISO-8859-15");
56  
57      /*** Standard charset decoder.*/
58      private final static CharsetDecoder decoder = charset.newDecoder();
59  
60      /*** The list of the extraction patterns. */
61      private static AbstractPattern[] apatList =
62              PatternProvider.instance().getPatterns();
63  
64      private LinkBuilder linkBuilder;
65  
66      /***
67       * Creates a new instance of RegExpLinksExtractor
68       * @param parsedPageLink
69       */
70      public RegExpLinksExtractor(Link parsedPageLink) {
71          this.linkBuilder = new LinkBuilderImpl(parsedPageLink);
72      }
73  
74      /***
75       *
76       * @param content
77       * @return
78       */
79      public Link[] extract(Content content) {
80          log.debug("extractLinks(): BEGIN");
81          if (content.getContentType().indexOf("htm") < 0){
82              return new Link[0];
83          }
84          byte[] buffer = content.getBuffer();
85          ByteBuffer bbuf = ByteBuffer.allocate(buffer.length);
86          bbuf.put(buffer);
87          bbuf.flip();
88  
89          Vector<Link> vect = new Vector<Link>();
90          try {
91              CharBuffer charBuf = decoder.decode(bbuf);
92              //log.debug("extractLinks(): apatList.length = " + apatList.length);
93              for (AbstractPattern apat : apatList) {
94                  log.debug("extractLinks(): [" + apat.getClass().getName()
95                      + "] checking pattern " + apat.getPattern());
96                  Pattern p = apat.getPattern();
97  
98                  Matcher matcher = p.matcher(charBuf);
99                  while(matcher.find()) {
100                     CharSequence cs = matcher.group(apat.getGroupAsInt());
101 
102                     HtmlURL htmlURL = new HtmlURLImpl(cs.toString());
103                     logExtr.info(apat.getClass().getName() +
104                             " " + content.getLink() +
105                             " " + htmlURL.getCleanedLinkAsString());
106 
107                     Link newLink = linkBuilder.buildLink(htmlURL);
108 
109                     if (newLink != null && !vect.contains(newLink)) {
110                         vect.add(newLink);
111                         log.debug("extractLinks(): adding link " + newLink);
112                     }
113                 }
114             }
115         }catch(Exception e) {
116             log.error("extractLinks(): Error extracting links.", e);
117         }
118         log.debug("extractLinks(): found " + vect.size() + " links");
119         Link[] res = new Link[vect.size()];
120         vect.copyInto(res);
121         bbuf.clear();
122 
123         log.debug("extractLinks(): END");
124         return res;
125 
126     }
127 }